library(tidyverse)
library(base)
library(patchwork)
library(scales)
library(ggridges)
library(patchwork)
library(ggseqlogo)
library(dplyr)
library(table1)
library(quarto)Logistic modelling
Load Data
Load packages
# Create the data directory programmatically
#dir.create(path="data")
# _raw data folder, filename, and original data location
raw_dir <- "../data/_raw/"
data_file <- "rhc.csv"
data_loc <- "https://hbiostat.org/data/repo/"
# Checking if file exist in folder
if( !dir.exists(raw_dir) ){
dir.create(path = raw_dir)
}
if( !file.exists(str_c(raw_dir, data_file)) ){
download.file(
url = str_c(data_loc, data_file),
destfile = str_c(raw_dir, data_file))
}
rhc_data <- read_csv(file = str_c(raw_dir, data_file))
print(str_c(raw_dir,data_file))[1] "../data/_raw/rhc.csv"
# Write the data to data folder
write_csv(x = rhc_data,
file = "../data/rhc.csv")quarto::quarto_render(input = "01_load.qmd",
output_format = "html")[31m
processing file: 01_load.qmd
[39m1/4
2/4 [Load-librariess]
3/4
4/4 [Load-file-and-write-file]
[31moutput file: 01_load.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :[39m[31m
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 01_load.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
Output created: 01_load.html
file.rename(from = "01_load.html",
to = "../results/01_load.html")[1] TRUE
Clean Data
Clean data
library(tidyverse)
library(base)
library(patchwork)
library(scales)
library(ggridges)
library(patchwork)
library(ggseqlogo)
library(dplyr)
library(table1)
rhc_data <- read_csv("../data/rhc.csv")
#| label: Simple-cleaning-chunk-data-wrangling
#copy paste
rhc_clean <- rhc_data |>
bind_cols() |>
as_tibble()
# convert death string to binary: 1 = dead 0 = alive
rhc_clean <- rhc_clean |>
mutate(death = case_when(
death == "No" ~ 0,
death == "Yes" ~ 1))
#Change RHC strings to factor
rhc_clean<- rhc_clean |>
mutate(swang1 = case_when(
swang1 == "No RHC" ~ 0,
swang1 == "RHC" ~ 1))
#round age to the nearest whole number
rhc_clean <- rhc_clean |>
rename(weight = wtkilo1) |>
mutate(age = round(age, 0),
weight = round(weight, 0),
edu = round(edu, 0),
temp1 = round(temp1, 1),
ph1 = round(ph1, 2))
#Remove old ptid and create new ID's
rhc_clean <- rhc_clean |>
select(-c(ptid),
-ends_with("dte"),
-adld3p) |>
rename(ptID= "...1")Save data
quarto::quarto_render(input = "02_clean.qmd",
output_format = "html")[31m
processing file: 02_clean.qmd
[39m1/5
2/5 [Load-libraries]
3/5
4/5 [Writing-new-cleaned-csv]
5/5
[31moutput file: 02_clean.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 02_clean.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
Output created: 02_clean.html
file.rename(from = "02_clean.html",
to = "../results/02_clean.html")[1] TRUE
Augment Data
Load packages
library(tidyverse)
library(base)
library(patchwork)
library(scales)
library(ggridges)
library(patchwork)
library(ggseqlogo)
library(dplyr)
library(table1)#Augumented data ::: {.cell}
create_binary_column <- function(x) {
ifelse(x == "Yes", 1,
ifelse(x == "No", 0, x))
}
rhc_clean <- read_csv("../data/rhc_clean.csv")Rows: 5735 Columns: 57
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (19): cat1, cat2, ca, sex, dth30, dnr1, ninsclas, resp, card, neuro, gas...
dbl (38): ptID, death, cardiohx, chfhx, dementhx, psychhx, chrpulhx, renalhx...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Creates one column with all diseases that is reason for admission to the hospital, instead of them all being present for all patients.
rhc_clean_long <- rhc_clean |>
mutate(across(resp:ortho,
~create_binary_column(.))) |>
pivot_longer(cols = resp:ortho,
names_to = "Diagnosis",
values_to = "Values")|>
filter(Values==1) |>
group_by(ptID) |>
mutate(Diagnosis = paste(unique(Diagnosis),
collapse = ",")) |>
ungroup()
#makes one value for patients with multiple diagnosis as reason for admission and delete repeated columns
rhc_aug <- rhc_clean_long |>
mutate(Diagnosis = case_when(str_detect(Diagnosis,
pattern = ",",
negate = FALSE) == TRUE ~ "multiple diagnosis",
str_detect(Diagnosis,
pattern = ",",
negate = FALSE) == FALSE ~ Diagnosis )) |>
unique.data.frame() |>
mutate(aps1_Interval = cut(aps1,
breaks = c(0, 15, 30, 45, 60, 75, 90,
105, 120, 135, 150),
labels = c("0-15", "15-30", "30-45",
"45-60", "60-75", "75-90",
"90-105","105-120", "120-135",
"135-150"))) |>
mutate(age_group = cut(age,
breaks = c(0, 10, 20, 30, 40, 50,
60, 70, 80, 90, Inf),
labels = c("<10", "10-20", "20-30",
"30-40", "40-50", "50-60",
"60-70", "70-80", "80-90",
"> 90"),
include.lowest = TRUE,
na.rm = TRUE)) |>
mutate(edu_group = cut(edu,
breaks = c(-Inf, 10, 20, Inf),
labels = c("<10 years",
"between 10-20 years",
">20 years"),
include.lowest = TRUE,
na.rm = TRUE))
#Add a column with the number of comorbidities each patient has.
rhc_aug <- rhc_aug |>
rowwise() |>
mutate(comorbidities = sum(c_across(cardiohx:amihx)))
#simulated seperated and joined datatable
rhc_split1 <- rhc_aug |>
select(-c(edu_group, comorbidities))
rhc_split2 <- rhc_aug |>
select(c(ptID, edu_group, comorbidities))
rhc_aug <- inner_join(x = rhc_split1,
y = rhc_split2,
by ="ptID"):::
Save data
# Write the data to disk
write_csv(x = rhc_aug,
file = "../data/rhc_aug.csv")quarto::quarto_render(input = "03_aug.qmd",
output_format = "html")[31m
processing file: 03_aug.qmd
[39m1/7
2/7 [unnamed-chunk-1]
3/7
4/7 [Augumenting-initiation.]
5/7
6/7 [Writing-new-augmented-csv]
7/7
[31moutput file: 03_aug.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 03_aug.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
Output created: 03_aug.html
file.rename(from = "03_aug.html",
to = "../results/03_aug.html")[1] TRUE
Description
Load packages
library(tidyverse)
library(table1)The dataset we have used comes from Connors et al. (1996): The effectiveness of RHC in the initial care of critically ill patients. It was publicly available on hbiostat.org and downloaded as a cvs file. The dataset contains information about patients admitted to the hospital in order to study the effect of right heart catherization (RHC). There are total of 2184 patients in the “disease” group which recieved RHC and 3551 patients in the “control” group without RHC. The control group was artificially created using a propensity matching score. The dataset contains 5735 rows and 63 columns.
rhc_data <- read_csv("../data/rhc.csv")Rows: 5735 Columns: 63
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (22): cat1, cat2, ca, death, sex, dth30, swang1, dnr1, ninsclas, resp, c...
dbl (41): ...1, sadmdte, dschdte, dthdte, lstctdte, cardiohx, chfhx, dementh...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rhc_aug <- read_csv("../data/rhc_aug.csv")Rows: 5612 Columns: 53
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): cat1, cat2, ca, sex, dth30, dnr1, ninsclas, race, income, Diagnosi...
dbl (40): ptID, death, cardiohx, chfhx, dementhx, psychhx, chrpulhx, renalhx...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rhc_data |>
dim()[1] 5735 63
rhc_data |>
group_by(swang1) |>
summarise(n = n())# A tibble: 2 × 2
swang1 n
<chr> <int>
1 No RHC 3551
2 RHC 2184
The 63 variables can be divided into categories- Patient information includes general facts about the patients like race, age, gender. Socioeconomic information includes level of income, years of education, and healthcare. Fysiological stats are measurements like temperature, heart rate and blood pressure. Admission diagnosis are conditoins accounting for the reasons of the hospital visit. Comorbidities illness are diseases caused as a direct affect an underlying disease. Hospitalization information like addmission date, discharge date and date of last contact. Lastly, information about death vs. survival and date of last contact.
A statistical summary of patient information is displayed in table1. Swang1 is the attribute that shows if the patient recieved RHS or not.
T1 <- rhc_aug |>
mutate(sex = factor(sex),
swang1 = factor(swang1),
death = factor(x = death,
levels = c(0,1),
c("Alive","Dead"))) |>
table1(x = formula(~ sex + age + race + swang1 | death),
data = _)
T1| Alive (N=1972) |
Dead (N=3640) |
Overall (N=5612) |
|
|---|---|---|---|
| sex | |||
| Female | 906 (45.9%) | 1594 (43.8%) | 2500 (44.5%) |
| Male | 1066 (54.1%) | 2046 (56.2%) | 3112 (55.5%) |
| age | |||
| Mean (SD) | 56.6 (17.4) | 64.0 (15.7) | 61.4 (16.7) |
| Median [Min, Max] | 58.0 [18.0, 102] | 66.0 [18.0, 101] | 64.0 [18.0, 102] |
| race | |||
| black | 323 (16.4%) | 577 (15.9%) | 900 (16.0%) |
| other | 121 (6.1%) | 223 (6.1%) | 344 (6.1%) |
| white | 1528 (77.5%) | 2840 (78.0%) | 4368 (77.8%) |
| swang1 | |||
| 0 | 1291 (65.5%) | 2177 (59.8%) | 3468 (61.8%) |
| 1 | 681 (34.5%) | 1463 (40.2%) | 2144 (38.2%) |
The broad spectrum of attributes allows for exciting analysis of how treatment type, diseases and economic factors influence the survival rate of people.
quarto::quarto_render(input = "04_describe.qmd",
output_format = "html")[31m
processing file: 04_describe.qmd
[39m1/11
2/11 [unnamed-chunk-1]
3/11
4/11 [unnamed-chunk-2]
5/11
6/11 [unnamed-chunk-3]
7/11
8/11 [unnamed-chunk-4]
9/11
10/11 [unnamed-chunk-5]
11/11
[31moutput file: 04_describe.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 04_describe.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
Output created: 04_describe.html
file.rename(from = "04_describe.html",
to = "../results/04_describe.html")[1] TRUE
Analysis 1
SOCIOECONOMIC analysis
We conducted a comprehensive socioeconomic analysis focusing on variables such as age, gender, education, race, income, and medical assurance. Plots and summary statistics explore the distribution of deaths and the application of right heart catheterization (RHC) across different subgroups of patients. The analysis provides insights into potential associations between socioeconomic factors and health outcomes.
Libraries
library(tidyverse)
library(base)
library(patchwork)
library(scales)
library(ggridges)
library(patchwork)
library(ggseqlogo)
library(dplyr)
library(table1)rhc_aug <- read_csv("../data/rhc_aug.csv")Rows: 5612 Columns: 53
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): cat1, cat2, ca, sex, dth30, dnr1, ninsclas, race, income, Diagnosi...
dbl (40): ptID, death, cardiohx, chfhx, dementhx, psychhx, chrpulhx, renalhx...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Color Scheme: ::: {.cell}
"darkolivegreen2"[1] "darkolivegreen2"
"darkgoldenrod1"[1] "darkgoldenrod1"
"red"[1] "red"
::: The variables that we found to be interesting to analyze from a socioeconomic perspective are: ::: {.cell}
rhc_aug |>
select(age, sex, edu, race, income, ninsclas)# A tibble: 5,612 × 6
age sex edu race income ninsclas
<dbl> <chr> <dbl> <chr> <chr> <chr>
1 70 Male 12 white Under $11k Medicare
2 78 Female 12 white Under $11k Private & Medicare
3 46 Female 14 white $25-$50k Private
4 75 Female 9 white $11-$25k Private & Medicare
5 68 Male 10 white Under $11k Medicare
6 86 Female 8 white Under $11k Medicare
7 55 Male 14 white $25-$50k Private
8 44 Male 12 white $25-$50k Private
9 18 Female 13 white Under $11k Private
10 48 Female 11 white Under $11k Medicaid
# ℹ 5,602 more rows
::: For all the variables of interest, we focused on plotting the death distribution of patients and if they had been applied an RHC.
Age
Following plot provides insights into the mortality rates across different ages. ::: {.cell}
# Death in different ages
rhc_aug |>
ggplot(aes(x = age,
fill = factor(death))) +
geom_bar() +
labs(title = "Death in different ages",
x = "Age",
y = "Number of patients") +
scale_fill_manual(name = "Death",
values = c("0" = "blue", "1" = "red")) +
theme_minimal():::
Following plot provides insights into the mortality rates across different age groups. ::: {.cell}
# Death in different group ages
rhc_aug |>
ggplot(aes(x = age_group,
fill = factor(death))) +
geom_bar() +
labs(title = "Death in different ages",
x = "Age",
y = "Number of patients") +
scale_fill_manual(name = "Death",
values = c("0" = "blue", "1" = "red")) +
theme_minimal():::
The following chunk is a representation of the percentage of deaths by age group. ::: {.cell}
## Percentage of death by age
rhc_aug |>
group_by(age_group, death) |>
summarize(count = n()) |>
group_by(age_group) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(death == 1) |>
select(age_group, percent)`summarise()` has grouped output by 'age_group'. You can override using the
`.groups` argument.
# A tibble: 9 × 2
age_group percent
<chr> <dbl>
1 10-20 31.7
2 20-30 41.1
3 30-40 49.7
4 40-50 55.1
5 50-60 62.5
6 60-70 69.0
7 70-80 71.9
8 80-90 77.8
9 > 90 85
::: Several observations can be made. Firstly, it becomes apparent that mortality rates increase with advancing age. This finding aligns with existing knowledge that older individuals generally have a higher risk of experiencing adverse health outcomes. Secondly, the plot demonstrates that the number of deaths is relatively higher in certain age brackets, indicating the presence of age-specific vulnerabilities. Further research would be needed to fully understand the underlying reasons for these variations.
Following plot provides insights into the RHC appliance rates across different age groups. ::: {.cell}
# Count of RHC by age
rhc_aug |>
ggplot(aes(x = age_group,
fill = as.factor(swang1))) +
geom_bar() +
labs(title = "Distribution of Age by RHC Status",
x = "Age",
y = "Count",
fill = "RHC Status") +
scale_fill_manual(name= "RHC Status",
values = c("No RHC" = "blue", "RHC" = "red")) +
theme_minimal():::
The following chunk shows the corresponding percentage of RHC procedures for each age group. ::: {.cell}
## Percentage of RHC by age_group
rhc_aug |>
group_by(age_group, swang1) |>
summarize(count = n()) |>
group_by(age_group) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(swang1 == "RHC") |>
select(age_group, percent)`summarise()` has grouped output by 'age_group'. You can override using the
`.groups` argument.
# A tibble: 0 × 2
# ℹ 2 variables: age_group <chr>, percent <dbl>
::: This results suggests that middle-aged patients (40-50 and 60-70 age groups) may have higher rates of RHC procedures, potentially due to specific cardiac conditions or clinical indications. Conversely, patients (10-20 age group) and older (> 90 age group) may have lower rates of RHC procedures, indicating a different clinical approach or lower prevalence of conditions requiring RHC. Further research would be needed to understand the underlying reasons for these variations and fully evaluate the appropriateness of RHC procedures.
Gender
Following plot provides insights into the mortality rates across different genders. ::: {.cell}
# Death females vs. males
rhc_aug |>
ggplot(aes(x = sex,
fill = factor(death))) +
geom_bar() +
labs(title = "Death females vs. males",
x = "Gender",
y = "Number of patients") +
scale_fill_manual(name = "Death",
values = c("0" = "blue", "1" = "red")) +
theme_minimal():::
The following chunk is a representation of the percentage of deaths by gender. ::: {.cell}
# Percentage of death by gender
rhc_aug |>
group_by(sex, death) |>
summarize(count = n()) |>
group_by(sex) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(death == 1) |>
select(sex, percent)`summarise()` has grouped output by 'sex'. You can override using the `.groups`
argument.
# A tibble: 2 × 2
sex percent
<chr> <dbl>
1 Female 63.8
2 Male 65.7
::: This results suggest that there may be a slight difference in mortality rates between females and males, with males having a slightly higher percentage of deaths. However, it is important to note that the difference is relatively small and may not be statistically significant. Further research would be needed to fully understand and check the underlying reasons for these variations.
Following plot provides insights into the RHC appliance rates across different genders. ::: {.cell}
# Count of RHC by gender
rhc_aug |>
ggplot(aes(x = sex,
fill = as.factor(swang1))) +
geom_bar() +
labs(title = "Distribution of Gender by RHC Status",
x = "Gender",
y = "Number of patients",
fill = "RHC Status") +
scale_fill_manual(name= "RHC Status",
values = c("No RHC" = "blue", "RHC" = "red")) +
theme_minimal():::
The following chunk shows the corresponding percentage of RHC procedures for each gender. ::: {.cell}
## Percentage of RHC by gender
rhc_aug |>
group_by(sex, swang1) |>
summarize(count = n()) |>
group_by(sex) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(swang1 == "RHC") |>
select(sex, percent)`summarise()` has grouped output by 'sex'. You can override using the `.groups`
argument.
# A tibble: 0 × 2
# ℹ 2 variables: sex <chr>, percent <dbl>
::: This suggest a potential difference in RHC utilization between females and males, with males having a slightly higher percentage of RHC procedures. Further research would be needed to fully understand and check the underlying reasons for these variations.
Race
Following plot provides insights into the mortality rates across different races. ::: {.cell}
#Death by race
rhc_aug |>
ggplot(aes(x = race,
fill = factor(death))) +
geom_bar() +
labs(title = "Death in race",
x = "Race",
y = "Count",
fill = "Death") +
scale_fill_manual(name = "Death",
values = c("0" = "blue", "1" = "red")) +
theme_minimal():::
The following chunk is a representation of the percentage of deaths by race. ::: {.cell}
# Percentage of death by race
rhc_aug |>
group_by(race, death) |>
summarize(count = n()) |>
group_by(race) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(death == 1) |>
select(race, percent)`summarise()` has grouped output by 'race'. You can override using the
`.groups` argument.
# A tibble: 3 × 2
race percent
<chr> <dbl>
1 black 64.1
2 other 64.8
3 white 65.0
::: This suggest a potential variation in mortality rates among different racial groups, with the white population having a slightly higher percentage of deaths. Further research would be needed to fully understand and check the underlying reasons for these variations.
Following plot provides insights into the RHC appliance rates across different race. ::: {.cell}
rhc_aug |>
ggplot(aes(x = race,
fill = as.factor(swang1))) +
geom_bar() +
labs(title = "Distribution of Race by RHC Status",
x = "Race",
y = "Number of patients",
fill = "RHC Status") +
scale_fill_manual(name= "RHC Status",
values = c("No RHC" = "blue", "RHC" = "red")) +
theme_minimal():::
The following chunk shows the corresponding percentage of RHC procedures by race. ::: {.cell}
## Percentage of RHC by race
rhc_aug |>
group_by(race, swang1) |>
summarize(count = n()) |>
group_by(race) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(swang1 == "RHC") |>
select(race, percent)`summarise()` has grouped output by 'race'. You can override using the
`.groups` argument.
# A tibble: 0 × 2
# ℹ 2 variables: race <chr>, percent <dbl>
::: These findings suggest a potential variation in RHC utilization among different racial groups, with the other racial category having the highest percentage of RHC procedures. Further research would be needed to fully understand and check the underlying reasons for these variations.
Income
Following plot provides insights into the mortality rates across different patients’ incomes. ::: {.cell}
#Death by income
rhc_aug |>
ggplot(aes(x = income,
fill = factor(death))) +
geom_bar() +
labs(title = "Death by Income",
x = "Income",
y = "Number of patients",
fill = "Death") +
scale_fill_manual(name = "Death", values = c("0" = "blue", "1" = "red")) +
theme_minimal():::
The following chunk is a representation of the percentage of deaths by income. ::: {.cell}
## Percentage of death by income
rhc_aug |>
group_by(income, death) |>
summarize(count = n()) |>
group_by(income) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(death == 1) |>
select(income, percent)`summarise()` has grouped output by 'income'. You can override using the
`.groups` argument.
# A tibble: 4 × 2
income percent
<chr> <dbl>
1 $11-$25k 63.7
2 $25-$50k 58.2
3 > $50k 60.3
4 Under $11k 67.8
::: This suggest that the lowest income category have the highest percentage of deaths. However, it is important to note that the differences are relatively small and may not be statistically significant. Further research would be needed to fully understand and check the underlying reasons for these variations.
Following plot provides insights into the RHC appliance rates across different incomes. ::: {.cell}
# Count of RHC by income
rhc_aug |>
ggplot(aes(x = income,
fill = as.factor(swang1))) +
geom_bar() +
labs(title = "Distribution of Income by RHC Status",
x = "Income",
y = "Number of patients",
fill = "RHC Status") +
scale_fill_manual(name = "RHC Status",
values = c("No RHC" = "blue", "RHC" = "red")) +
theme_minimal():::
The following chunk shows the corresponding percentage of RHC procedures by income. ::: {.cell}
## Percentage of RHC by income
rhc_aug |>
group_by(income, swang1) |>
summarize(count = n()) |>
group_by(income) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(swang1 == "RHC") |>
select(income, percent)`summarise()` has grouped output by 'income'. You can override using the
`.groups` argument.
# A tibble: 0 × 2
# ℹ 2 variables: income <chr>, percent <dbl>
::: This suggests that patients with higher incomes may have better access to healthcare facilities and may be more likely to undergo diagnostic procedures like RHC. Further research would be needed to fully understand and check the underlying reasons for these variations.
Education
Following plot provides insights into the mortality rates across different education groups. ::: {.cell}
# Death in different group of education
rhc_aug |>
ggplot(aes(x = edu_group,
fill = factor(death))) +
geom_bar() +
labs(title = "Death by Education",
x = "Years of eduaction",
y = "Number of patients") +
scale_fill_manual(name = "Death",
values = c("0" = "blue", "1" = "red")) +
theme_minimal():::
The following chunk is a representation of the percentage of deaths by education. ::: {.cell}
## Percentage of death by education
rhc_aug |>
group_by(edu_group, death) |>
summarize(count = n()) |>
group_by(edu_group) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(death == 1) |>
select(edu_group, percent)`summarise()` has grouped output by 'edu_group'. You can override using the
`.groups` argument.
# A tibble: 3 × 2
edu_group percent
<chr> <dbl>
1 <10 years 67.5
2 >20 years 70
3 between 10-20 years 63.7
::: Individuals with higher education levels have slightly higher percentages of deaths. It is possible that individuals with higher education levels may have a higher likelihood of seeking medical attention and being diagnosed with underlying health conditions, which could contribute to the observed higher mortality rates. Further research would be needed to fully understand and check the underlying reasons for these variations.
Following plot provides insights into the RHC appliance rates across different education levels. ::: {.cell}
# Count of RHC by Education
rhc_aug |>
ggplot(aes((x = edu_group),
fill = as.factor(swang1))) +
geom_bar() +
labs(title = "Distribution of Education by RHC Status",
x = "Education",
y = "Number of patients",
fill = "RHC Status") +
scale_fill_manual(name = "RHC Status",
values = c("No RHC" = "blue", "RHC" = "red")) +
theme_minimal():::
The following chunk shows the corresponding percentage of RHC procedures by Education. ::: {.cell}
## Percentage of RHC by education
rhc_aug |>
group_by(edu_group, swang1) |>
summarize(count = n()) |>
group_by(edu_group) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(swang1 == "RHC") |>
select(edu_group, percent)`summarise()` has grouped output by 'edu_group'. You can override using the
`.groups` argument.
# A tibble: 0 × 2
# ℹ 2 variables: edu_group <chr>, percent <dbl>
::: Individuals with higher education levels have slightly higher percentages of RHC procedures. They may have better access to healthcare, increased health literacy, and a greater understanding of the benefits of diagnostic procedures like RHC. So healthcare providers may be more likely to recommend RHC to patients with higher education levels. Further research would be needed to fully understand and check the underlying reasons for these variations.
Medical assurance
Following plot provides insights into the mortality rates across different medical assurance. ::: {.cell}
#Death by medical assurance
rhc_aug |>
ggplot(aes(x = ninsclas,
fill = factor(death))) +
geom_bar() +
labs(title = "Death in medical assurance",
x = "Medical assurance",
y = "Number of patients",
fill = "Death") +
scale_fill_manual(name = "Death",
values = c("0" = "blue", "1" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)):::
The following chunk is a representation of the percentage of deaths by medical assurance. ::: {.cell}
## Percentage of DEATH by medical assurance
rhc_aug |>
group_by(ninsclas, death) |>
summarize(count = n()) |>
group_by(ninsclas) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(death == 1) |>
select(ninsclas, percent)`summarise()` has grouped output by 'ninsclas'. You can override using the
`.groups` argument.
# A tibble: 6 × 2
ninsclas percent
<chr> <dbl>
1 Medicaid 54.1
2 Medicare 72.4
3 Medicare & Medicaid 70.2
4 No insurance 56.9
5 Private 58.6
6 Private & Medicare 70.7
::: Individuals covered by Medicare having the highest percentage of deaths. This can be because Medicare is primarily for individuals aged 65 and older or those with certain disabilities, who may have a higher prevalence of chronic health conditions and therefore a higher risk of mortality. Further research would be needed to fully understand and check the underlying reasons for these variations.
Following plot provides insights into the RHC appliance rates across different medical assurance. ::: {.cell}
# Count of RHC by Medical Assurance
rhc_aug |>
ggplot(aes((x = ninsclas),
fill = as.factor(swang1))) +
geom_bar() +
labs(title = "Distribution of Medical assurance by RHC Status",
x = "Medical assurance",
y = "Number of patients",
fill = "RHC Status") +
scale_fill_manual(name = "RHC Status",
values = c("No RHC" = "blue", "RHC" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)):::
The following chunk shows the corresponding percentage of RHC procedures by medical assurance. ::: {.cell}
## Percentage of RHC by medical assurance
rhc_aug |>
group_by(ninsclas, swang1) |>
summarize(count = n()) |>
group_by(ninsclas) |>
mutate(percent = count / sum(count) * 100) |>
ungroup() |>
filter(swang1 == "RHC") |>
select(ninsclas, percent)`summarise()` has grouped output by 'ninsclas'. You can override using the
`.groups` argument.
# A tibble: 0 × 2
# ℹ 2 variables: ninsclas <chr>, percent <dbl>
::: Individuals without insurance and those with private insurance have higher percentages of RHC procedures. Individuals without insurance may have limited access to healthcare services and may seek RHC procedures when their condition becomes severe. On the other hand, individuals with private insurance may have better access to healthcare facilities and may be more likely to undergo diagnostic procedures like RHC. Further research would be needed to fully understand and check the underlying reasons for these variations.
Others
death_count <- rhc_aug |>
mutate(death = case_when(
death == 0 ~ "Alive",
death == 1 ~ "Dead")) |>
# filter(death == 0) |>
mutate(income = factor(income, levels = rev(levels(as.factor(income))))) |>
ggplot(aes(x = income,
fill = as.factor(death))) +
geom_bar(position = "dodge") +
labs(title = "Death Count by Income and Medical Assurance",
x = "Income",
y = "Number of patients",
fill = "Death status") +
facet_wrap(~ninsclas) +
scale_fill_manual(name = "Death", values = c( "darkolivegreen2", "red")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
death_countggsave(filename = "05_death_count_1.png",
plot = death_count,
device = "png",
path = "../results",
scale = 1,
limitsize = TRUE)Saving 7 x 5 in image
How many people under $11k have private and how many have Medicare?
income_medicalinsurance <- rhc_aug |>
ggplot(aes(x = income,
fill = factor(ninsclas))) +
geom_bar() +
labs(title = "Income by Medical Assurance",
x = "Income",
y = "Number of Patients",
fill = "Medical Insurance")
income_medicalinsuranceggsave(filename = "05_income_medicalinsurance_2.png",
plot = income_medicalinsurance,
device = "png",
path = "../results",
scale = 1,
limitsize = TRUE)Saving 7 x 5 in image
rhc_aug |>
filter(income == "Under $11k",
ninsclas %in% c("Medicare", "Private")) |>
count(income, ninsclas)# A tibble: 2 × 3
income ninsclas n
<chr> <chr> <int>
1 Under $11k Medicare 965
2 Under $11k Private 471
# Bar plot for RHC by income and race
rhc_aug |>
# filter(swang1 == 'RHC') |>
ggplot(aes(x = race,
fill = as.factor(swang1))) +
geom_bar(position = "dodge") +
labs(title = "RHC Count by Income and Race",
x = "Income",
y = "Number of patients") +
facet_wrap(~income) +
theme(axis.text.x = element_text(angle = 45,
hjust = 1)) +
theme_minimal()# Bar plot for RHC by income and medical assurance
rhc_aug |>
ggplot(aes(x = race,
fill = as.factor(swang1),
group = as.factor(income))) +
geom_bar(position = "dodge") +
labs(title = "RHC Count by Income and Medical Assurance",
x = "Income",
y = "Count",
fill = "RHC status") +
facet_wrap(~income) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Warning: The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
quarto::quarto_render(input = "05_analysis_1.qmd",
output_format = "html")[31m
processing file: 05_analysis_1.qmd
[39m1/69
2/69 [unnamed-chunk-1]
3/69
4/69 [unnamed-chunk-2]
5/69
6/69 [unnamed-chunk-3]
7/69
8/69 [unnamed-chunk-4]
9/69
10/69 [unnamed-chunk-5]
11/69
12/69 [unnamed-chunk-6]
13/69
14/69 [unnamed-chunk-7]
15/69
16/69 [unnamed-chunk-8]
17/69
18/69 [unnamed-chunk-9]
19/69
20/69 [unnamed-chunk-10]
21/69
22/69 [unnamed-chunk-11]
23/69
24/69 [unnamed-chunk-12]
25/69
26/69 [unnamed-chunk-13]
27/69
28/69 [unnamed-chunk-14]
29/69
30/69 [unnamed-chunk-15]
31/69
32/69 [unnamed-chunk-16]
33/69
34/69 [unnamed-chunk-17]
35/69
36/69 [unnamed-chunk-18]
37/69
38/69 [unnamed-chunk-19]
39/69
40/69 [unnamed-chunk-20]
41/69
42/69 [unnamed-chunk-21]
43/69
44/69 [unnamed-chunk-22]
45/69
46/69 [unnamed-chunk-23]
47/69
48/69 [unnamed-chunk-24]
49/69
50/69 [unnamed-chunk-25]
51/69
52/69 [unnamed-chunk-26]
53/69
54/69 [unnamed-chunk-27]
55/69
56/69 [unnamed-chunk-28]
57/69
58/69 [unnamed-chunk-29]
59/69
60/69 [unnamed-chunk-30]
61/69
62/69 [unnamed-chunk-31]
63/69
64/69 [unnamed-chunk-32]
65/69
66/69 [unnamed-chunk-33]
67/69
68/69 [unnamed-chunk-34]
69/69
[31moutput file: 05_analysis_1.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 05_analysis_1.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
Output created: 05_analysis_1.html
file.rename(from = "05_analysis_1.html",
to = "../results/05_analysis_1.html")[1] TRUE
Analysis 2
library(tidyverse)
library(base)
library(patchwork)
library(scales)
library(ggridges)
library(patchwork)
library(ggseqlogo)
library(dplyr)
library(table1)The analysis explores the correlation between illnesses, Right Heart Catheterization (RHC) treatment, and the Acute Physiology Score (APS) concerning mortality. Various plots are generated to create an understanding of the dataset, aiming to reveal potential relationships among the mentioned attributes. These visual representations serve as a tool to discover whether specific factors are associated with death.
Load data ::: {.cell}
rhc_aug <- read_csv("../data/rhc_aug.csv")Rows: 5612 Columns: 53
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): cat1, cat2, ca, sex, dth30, dnr1, ninsclas, race, income, Diagnosi...
dbl (40): ptID, death, cardiohx, chfhx, dementhx, psychhx, chrpulhx, renalhx...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
:::
To get a quick overview of some the variables in the dataset we can plot some very simple plots. We start out by visualizing sex against the age, and color by the death column, ::: {.cell}
rhc_aug |>
ggplot(aes(x = age,
y = sex,
color = death)) +
geom_point()::: It is now seen here that there is no apparent pattern for death based on the sex against age plot above.
Another thing that we might want visualize is the mean blood pressure against each diagnosis. Before the data was agumented the code below was used to show patient without disease as well with the legend label “0”. Though, as the agumented code data was implemented the plot below show the same as the next plot. ::: {.cell}
:::
This plot is the same as the code above with color mapping for each diagnosis. It show the mean blood pressure for each diagnosis. ::: {.cell}
rhc_aug |>
ggplot(aes(x = Diagnosis,
y = meanbp1,
fill = factor(Diagnosis))) +
geom_point(aes(color = factor(Diagnosis)),
position = position_dodge(0.8),
alpha = 0.5) +
geom_boxplot(color = "black",
alpha = 0.5) +
labs(title = "Mean blood pressure against all diagnosis",
y = "Mean blood pressure",
color = "Diagonosis") +
guides(fill = guide_legend(title = "Diagnosis Status"),
color = FALSE) +
theme(legend.key.height = unit(0.7, 'cm'),
legend.key.width = unit(0.7, 'cm'))::: Already from these two plots we see that underlying data points are split into two groups and we will take this into consideration in a bit.
This plot doesn’t make much sense with use of rhc_aug data. Therefore, we will skip this as continuing with this will lead to a dead end. ::: {.cell}
::: The plot below show the mean blood pressure for patiens across all diagnoses and whether they had RHC performed on them. So we generally see picture of patients with lower blood pressure getting RHC performed on them. ::: {.cell}
rhc_aug |>
ggplot(aes(x = Diagnosis,
y = meanbp1,
fill = factor(swang1))) +
geom_point(aes(color = as.factor(swang1)),
position = position_dodge(0.8),
alpha = 0.5) +
geom_boxplot(alpha = 0.5,
outlier.shape = NA) +
labs(title = "Mean blood pressure for all diagnoses and RHC status",
y = "Mean blood pressure",
color = "RHC status",
fill = "RHC status") +
scale_color_manual(values = c("1" = "#619CFF", "0" = "#F8766D"),
labels = c("1" = "Yes", "0" = "No")):::
In the plot below we take into account the split in data points we see underneath each violin plot. This is due to the data following a bimodal distribution which has to peaks rather than the normal distribution which has one peak. The reason why we have chosen the violin plot is to signify the bimodal distribution. ::: {.cell}
meanbp_plot <- rhc_aug |>
ggplot(aes(x = cat1,
y = meanbp1,
fill = factor(swang1))) +
geom_point(aes(color = cat1),
position = position_dodge(0.8),
alpha = 0.5) +
geom_violin(alpha = 0.5,
outlier.shape = NA) +
#geom_boxplot(alpha = 0.35,
#outlier.shape = NA) +
scale_y_continuous(breaks = c(0, 50, 100, 150, 200),
labels = c(0, 50, 100, 150, 200)) +
labs(title = "Mean Blood Preassue Against Primary Diseases",
x = "Primary Disease Category",
y = "Mean Blood Pressure",
color = "Primary Disease Category") +
guides(fill = guide_legend(title = "RHC")) +
theme(axis.text.x = element_text(angle = 30,
hjust = 1,
vjust = 1))Warning in geom_violin(alpha = 0.5, outlier.shape = NA): Ignoring unknown
parameters: `outlier.shape`
meanbp_plotWarning: Groups with fewer than two data points have been dropped.
ggsave(filename = "06_meanbp_plot_3.png",
plot = meanbp_plot,
device = "png",
path = "../results",
scale = 1,
limitsize = TRUE)Saving 7 x 5 in image
Warning: Groups with fewer than two data points have been dropped.
:::
This plot shows all the diseases and how many people have it = 1 or dont have the disease = 0. ::: {.cell}
subHX <- rhc_aug |> select(ends_with("hx"))
subHX <- subHX |> pivot_longer(everything(),
names_to = "variables",
values_to = "values")
subHX |> mutate(values = factor(values)) |>
ggplot(aes(x = variables,
fill = values)) +
geom_bar(position = "dodge") +
theme(axis.text.x = element_text(angle = 20,
hjust = 1)):::
We begin by exploring if the number of comorbidity diseases influence the risk of death. ::: {.cell}
Comorbidities_risk <- rhc_aug |> mutate(death = factor(x = death,
levels = c(0,1),
c("No","Yes"))) |>
ggplot(aes(x = comorbidities,
fill = factor(death))) +
geom_bar(position = "stack") +
labs(title = "Risk of Dying by Number of Comorbidities",
x = "Number of Comorbidities",
y = "Number of Patients") +
scale_fill_manual(name = "Death",
values = c("No" = "darkolivegreen2",
"Yes" = "red"))
ggsave(filename = "06_Comorbidities_risk_4.png",
plot = Comorbidities_risk,
device = "png",
path = "../results",
scale = 1,
limitsize = TRUE)Saving 7 x 5 in image
:::
Now we want to include RHC treatment to see if this, together with comorbidities will influence the death rate. ::: {.cell}
rhc_aug |>
mutate(swang1 = factor(x = swang1,
levels = c(0,1),
c("no RHC","RHC"))) |>
ggplot(aes(x = comorbidities,
fill = factor(death))) +
facet_wrap(~swang1)+
geom_bar(position = "stack") +
labs(title = "Risk of dying by number of comorbidities",
x = "Number of comorbidities",
y = "Number of patients") +
scale_fill_manual(name = "Survival",
values = c("0" = "#619CFF", "1" = "#F8766D"),
labels = c('Dead', 'Alive')) +
theme_minimal():::
This plot shows all individuals who survided and died with and without the RHC treatment. ::: {.cell}
rhc_aug |>
mutate(swang1 = factor(x = swang1,
levels = c(0,1),
c("no RHC","RHC")),
death = factor(x = death,
levels = c(0,1),
c("alive","dead"))) |>
count(death,
swang1) |>
ggplot(aes(x = swang1,
y = n,
fill = death))+
geom_col(alpha = 1,
color="pink") +
labs(x = 'Treatment type',
y = 'n') +
theme(axis.text.x = element_text(angle = 20,
hjust = 1)):::
We investigate how many survive and die depending on their APS score. APS score predicts death by taking both laberatory values and patient signs into account. ::: {.cell}
rhc_aug |>
ggplot(aes(x = aps1,
fill = factor(death))) +
geom_bar(position = "stack") ::: And then we make the x-axis an interval to bin the observations. ::: {.cell}
rhc_aug |>
ggplot(aes(x = aps1_Interval,
fill=factor(death))) +
geom_bar(position = "stack") :::
quarto::quarto_render(input = "06_analysis_2.qmd",
output_format = "html")[31m
processing file: 06_analysis_2.qmd
[39m1/28 [unnamed-chunk-1]
2/28
3/28 [unnamed-chunk-2]
4/28
5/28 [plot-sex-against-age]
6/28
7/28 [meanbp-against-diagnosis-non-filtered]
8/28
9/28 [meanbp-across-diagnosis-filtered]
10/28
11/28 [meanbp-diagnosis-separated-RHC]
12/28
13/28 [meanbp-RHC-based-on-diagnosis]
14/28
15/28 [meanbp-RHC-based-on-primary-disease]
16/28
17/28 [unnamed-chunk-9]
18/28
19/28 [unnamed-chunk-10]
20/28
21/28 [unnamed-chunk-11]
22/28
23/28 [unnamed-chunk-12]
24/28
25/28 [unnamed-chunk-13]
26/28
27/28 [unnamed-chunk-14]
28/28
[31moutput file: 06_analysis_2.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 06_analysis_2.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
Output created: 06_analysis_2.html
file.rename(from = "06_analysis_2.html",
to = "../results/06_analysis_2.html")[1] TRUE
PCA
library(tidyverse)
library(broom)
library(purrr)
library(ggrepel)This file works with analysing the dataset using PCA.
Load data ::: {.cell}
rhc_aug <- read_csv("../data/rhc_aug.csv")Rows: 5612 Columns: 53
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): cat1, cat2, ca, sex, dth30, dnr1, ninsclas, race, income, Diagnosi...
dbl (40): ptID, death, cardiohx, chfhx, dementhx, psychhx, chrpulhx, renalhx...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
:::
PCA analysis
We change the swang1 variable to 1 if they had RHC performed and 0 if they did not. Furthermore, we choose the variables we wanted to investigate. Then we used the command prcomp, which, admittedly is a bit of a black box function. It performs a principal component analysis on our data, and also standardizes the data.
rhc_for_pca <- rhc_aug |>
select(c(death, swang1, age, edu,
meanbp1, resp1, hrt1,
crea1, pot1, surv2md1)) |>
mutate(death = case_when(
death == 0 ~ "Alive",
death == 1 ~ "Dead"))
rhc_for_pca# A tibble: 5,612 × 10
death swang1 age edu meanbp1 resp1 hrt1 crea1 pot1 surv2md1
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Alive 0 70 12 41 10 124 1.20 4 0.641
2 Dead 1 78 12 63 38 137 0.600 3.30 0.755
3 Alive 1 46 14 57 40 130 2.60 2.90 0.317
4 Dead 0 75 9 55 26 58 1.70 5.80 0.441
5 Dead 1 68 10 65 27 125 3.60 5.80 0.437
6 Alive 0 86 8 115 36 134 1.40 5.40 0.665
7 Alive 0 55 14 67 10 135 1 3.70 0.339
8 Dead 0 44 12 128 34 102 0.700 4.10 0.632
9 Alive 0 18 13 53 30 118 1.70 3.10 0.503
10 Alive 1 48 11 73 40 141 0.5 3.10 0.669
# ℹ 5,602 more rows
pca_fit <- rhc_for_pca |>
select(!death) |>
prcomp(scale = TRUE)
pca_fitStandard deviations (1, .., p=9):
[1] 1.3008328 1.1548351 1.1009700 1.0452214 0.9804279 0.8824818 0.8359159
[8] 0.8127687 0.7551198
Rotation (n x k) = (9 x 9):
PC1 PC2 PC3 PC4 PC5
swang1 0.21249427 0.47467627 0.14410101 0.32839947 -0.47491194
age 0.38712508 -0.35938905 0.27038915 -0.34758975 0.03159270
edu -0.09347023 0.35155287 0.04847525 0.32997453 0.79700042
meanbp1 -0.39980798 -0.36633320 -0.27541150 -0.02301811 -0.01838871
resp1 -0.28672608 0.29511353 0.01050469 -0.66001159 0.11183749
hrt1 -0.37123149 0.45045299 0.11626738 -0.30771256 -0.21102262
crea1 0.33211193 0.27027635 -0.54918178 -0.06077639 -0.10752849
pot1 0.31817258 0.08183601 -0.59828089 -0.23407798 0.16662071
surv2md1 -0.45259304 -0.13411257 -0.39338805 0.27043212 -0.20383751
PC6 PC7 PC8 PC9
swang1 0.32529507 -0.45925249 -0.03058807 -0.2402303
age 0.42724831 -0.33624273 -0.01301092 0.4798719
edu 0.28119860 -0.13520477 0.01910790 0.1533398
meanbp1 0.68769393 0.04516638 -0.00142545 -0.3927711
resp1 -0.11742809 -0.41070254 0.39399948 -0.2115688
hrt1 0.24296918 0.33668519 -0.44423598 0.3698744
crea1 0.23188453 0.32919583 0.52668165 0.2454826
pot1 -0.08404208 -0.23789265 -0.60521645 -0.1470753
surv2md1 -0.16121265 -0.45684937 0.04760497 0.5203545
I plot the coordinates of the first and second PCs, as well as the result of the different observations. ::: {.cell}
rhc_for_pca# A tibble: 5,612 × 10
death swang1 age edu meanbp1 resp1 hrt1 crea1 pot1 surv2md1
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Alive 0 70 12 41 10 124 1.20 4 0.641
2 Dead 1 78 12 63 38 137 0.600 3.30 0.755
3 Alive 1 46 14 57 40 130 2.60 2.90 0.317
4 Dead 0 75 9 55 26 58 1.70 5.80 0.441
5 Dead 1 68 10 65 27 125 3.60 5.80 0.437
6 Alive 0 86 8 115 36 134 1.40 5.40 0.665
7 Alive 0 55 14 67 10 135 1 3.70 0.339
8 Dead 0 44 12 128 34 102 0.700 4.10 0.632
9 Alive 0 18 13 53 30 118 1.70 3.10 0.503
10 Alive 1 48 11 73 40 141 0.5 3.10 0.669
# ℹ 5,602 more rows
coordinates <- pca_fit |>
augment(rhc_for_pca) |> # add original dataset back in
ggplot(aes(.fittedPC1,
.fittedPC2,
color = death),
alpha = 0.5) +
geom_point(size = 1.5) +
scale_color_manual(values = c(Dead = "darkred",
Alive = "darkolivegreen2"),
) +
labs(
x = "Principal Component 1",
y = "Principal Component 2",
title = "Observations Plotted on First and Second Principal Components"
)
coordinatesggsave(filename = "07_result_plotted_on_coordinates_5.png",
plot = coordinates,
device = "png",
path = "../results",
scale = 1,
limitsize = TRUE)Saving 7 x 5 in image
:::
The Eigenvalues are investigated, as well as how much of the variance each of the PCs explain. Then explanation of each PC is plotted as well as the accumulated explanation. ::: {.cell}
pca_fit |>
tidy(matrix = "eigenvalues")# A tibble: 9 × 4
PC std.dev percent cumulative
<dbl> <dbl> <dbl> <dbl>
1 1 1.30 0.188 0.188
2 2 1.15 0.148 0.336
3 3 1.10 0.135 0.471
4 4 1.05 0.121 0.592
5 5 0.980 0.107 0.699
6 6 0.882 0.0865 0.786
7 7 0.836 0.0776 0.863
8 8 0.813 0.0734 0.937
9 9 0.755 0.0634 1
pca_fit |>
tidy(matrix = "eigenvalues")# A tibble: 9 × 4
PC std.dev percent cumulative
<dbl> <dbl> <dbl> <dbl>
1 1 1.30 0.188 0.188
2 2 1.15 0.148 0.336
3 3 1.10 0.135 0.471
4 4 1.05 0.121 0.592
5 5 0.980 0.107 0.699
6 6 0.882 0.0865 0.786
7 7 0.836 0.0776 0.863
8 8 0.813 0.0734 0.937
9 9 0.755 0.0634 1
transformation_coefficient <- 1/0.18768
transformation_coefficient[1] 5.328218
pca_fit |>
tidy(matrix = "eigenvalues") |>
select(cumulative) |>
sum()[1] 5.87193
p1 <- pca_fit |>
tidy(matrix = "eigenvalues") |>
ggplot(aes(PC, percent)) +
geom_col(fill = "darkolivegreen2") +
geom_line(aes(y = cumulative / transformation_coefficient),
color = "darkgoldenrod1") +
geom_point(aes(y = cumulative / transformation_coefficient),
color = "darkgoldenrod1") +
geom_hline(aes(yintercept = 0.8 / transformation_coefficient),
linetype = 2,
color = "red") +
scale_x_continuous(breaks = 1:9) +
scale_y_continuous(labels = scales::percent_format(),
expand = expansion(mult = c(0, 0.01)),
sec.axis = sec_axis(trans = ~.*transformation_coefficient,
name = "Cumulative Variance Explained",
labels = scales::percent_format())) +
annotate("text",
label = "PCs needed : 7",
x = 7,
y = 0.125,
color = "darkred") +
labs(y = "Percent Variance Explained by Each PC",
title = "Variance explained by Principal Components")
p1ggsave(filename = "07_PCA_explained_6.png",
plot = p1,
device = "png",
path = "../results",
scale = 1,
limitsize = TRUE)Saving 7 x 5 in image
:::
I take a look at the rotation matrix ::: {.cell}
pca_fit |>
tidy(matrix = "rotation")# A tibble: 81 × 3
column PC value
<chr> <dbl> <dbl>
1 swang1 1 0.212
2 swang1 2 0.475
3 swang1 3 0.144
4 swang1 4 0.328
5 swang1 5 -0.475
6 swang1 6 0.325
7 swang1 7 -0.459
8 swang1 8 -0.0306
9 swang1 9 -0.240
10 age 1 0.387
# ℹ 71 more rows
:::
The influence of each variable on the first and second PC is plotted. ::: {.cell}
arrow_style <- arrow(angle = 20,
ends = "first",
type = "closed",
length = grid::unit(8, "pt"))
# plot rotation matrix
pca_fit |>
tidy(matrix = "rotation") |>
pivot_wider(names_from = "PC",
names_prefix = "PC",
values_from = "value") |>
ggplot(aes(PC1, PC2)) +
geom_segment(xend = 0,
yend = 0,
arrow = arrow_style) +
geom_text(aes(label = column),
hjust = 1, nudge_x = -0.02,
color = "#904C2F") +
xlim(-1.25, .5) + ylim(-.5, 1) +
coord_fixed() # fix aspect ratio to 1:1:::
quarto::quarto_render(input = "07_PCA.qmd",
output_format = "html")[31m
processing file: 07_PCA.qmd
[39m1/14 [unnamed-chunk-1]
2/14
3/14 [unnamed-chunk-2]
4/14
5/14 [unnamed-chunk-3]
6/14
7/14 [unnamed-chunk-4]
8/14
9/14 [unnamed-chunk-5]
10/14
11/14 [unnamed-chunk-6]
12/14
13/14 [unnamed-chunk-7]
14/14
[31moutput file: 07_PCA.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 07_PCA.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
Output created: 07_PCA.html
file.rename(from = "07_PCA.html",
to = "../results/07_PCA.html")[1] TRUE
Logistic Modelling
Logistic modelling
The Aps1 is the APACHE score, which is a score given to patients in the ICU based on their vital scores, age and overall health, this score estimates the mortality of the patient based on how ill they are.
Usually this ranges from 0-71, but in this data set the min and max are 3-147, this is probably because the scoring system has been changed since the data was created.
We wish to model what scores indicates death and what scores indicate you will survive for the APACHE score.
As the value death is a binary value we choose the logistic binomial model to try and fit the data.
library(tidyverse)
library(base)
library(patchwork)
library(scales)
library(ggridges)
library(patchwork)
library(ggseqlogo)
library(dplyr)
library(table1)
library(broom)
library(purrr)Load data ::: {.cell}
rhc_aug <- read_csv("../data/rhc_aug.csv")Rows: 5612 Columns: 53
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): cat1, cat2, ca, sex, dth30, dnr1, ninsclas, race, income, Diagnosi...
dbl (40): ptID, death, cardiohx, chfhx, dementhx, psychhx, chrpulhx, renalhx...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
:::
First we see if there is a general pattern in the mean values of the APACHE scores.
#Means of aps1 for each dignosis
rhc_aug |>
group_by(Diagnosis, death) |>
summarise(mu=mean(aps1))`summarise()` has grouped output by 'Diagnosis'. You can override using the
`.groups` argument.
# A tibble: 21 × 3
# Groups: Diagnosis [11]
Diagnosis death mu
<chr> <dbl> <dbl>
1 card 0 45.1
2 card 1 53.6
3 gastr 0 50.7
4 gastr 1 59.8
5 hema 0 58.7
6 hema 1 66.9
7 meta 0 52.3
8 meta 1 59.5
9 multiple diagnosis 0 55.5
10 multiple diagnosis 1 62.2
# ℹ 11 more rows
In all cases it is clear that the means of the APACHE scores are higher for those who end up dying.
Model of one disease
First we try to do the prediction based on one of the diseases that was reason for admission (card).
#Model of outcome of death based on aps1 for card patients.
aps1_card<-rhc_aug |>
filter(Diagnosis=='card')
aps1_card_model <- glm(formula = death ~ aps1,
data = aps1_card,
family='binomial')
aps1_card_model
Call: glm(formula = death ~ aps1, family = "binomial", data = aps1_card)
Coefficients:
(Intercept) aps1
-0.75407 0.02571
Degrees of Freedom: 1203 Total (i.e. Null); 1202 Residual
Null Deviance: 1593
Residual Deviance: 1533 AIC: 1537
The intercept means that each time the APACHE score increases by one, the chance of dying increases with 1/(1 + exp(-(-0.75407 + 0.02571 * x))).
An example is that for an APACHE score of 71 the risk of dying of cardiac disease is 74%.
aps1_card_model |>
summary()
Call:
glm(formula = death ~ aps1, family = "binomial", data = aps1_card)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.754069 0.177722 -4.243 2.21e-05 ***
aps1 0.025712 0.003472 7.406 1.30e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1592.5 on 1203 degrees of freedom
Residual deviance: 1532.8 on 1202 degrees of freedom
AIC: 1536.8
Number of Fisher Scoring iterations: 4
It seems that it is statistically significant.
Model all the diagnoses
We then choose to model the outcome for all the different admission diagnoses to see if we can predict the outcome of death by the APACHE score.
#Create nested data.
aps1_data_nested <- rhc_aug |>
group_by(Diagnosis) |>
nest() |>
ungroup()
aps1_data_nested# A tibble: 11 × 2
Diagnosis data
<chr> <list>
1 multiple diagnosis <tibble [1,630 × 52]>
2 seps <tibble [411 × 52]>
3 card <tibble [1,204 × 52]>
4 resp <tibble [1,150 × 52]>
5 renal <tibble [50 × 52]>
6 hema <tibble [55 × 52]>
7 gastr <tibble [565 × 52]>
8 trauma <tibble [42 × 52]>
9 neuro <tibble [445 × 52]>
10 meta <tibble [57 × 52]>
11 ortho <tibble [3 × 52]>
#View one of them to test
aps1_data_nested |>
filter(Diagnosis == "card") |>
pull()[[1]]
# A tibble: 1,204 × 52
ptID cat1 cat2 ca death cardiohx chfhx dementhx psychhx chrpulhx
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 3 MOSF w/Mali… MOSF… Yes 0 0 0 0 0 0
2 5 MOSF w/Seps… <NA> No 1 0 0 0 0 0
3 12 ARF <NA> No 0 0 0 0 0 0
4 13 MOSF w/Seps… <NA> No 1 1 0 0 0 0
5 15 ARF <NA> No 0 0 0 0 0 0
6 23 CHF <NA> No 1 0 1 0 0 0
7 25 CHF <NA> No 1 1 1 0 0 0
8 26 ARF <NA> No 0 0 0 0 0 0
9 29 Coma <NA> No 1 0 1 0 0 1
10 32 CHF <NA> No 0 1 1 0 0 0
# ℹ 1,194 more rows
# ℹ 42 more variables: renalhx <dbl>, liverhx <dbl>, gibledhx <dbl>,
# malighx <dbl>, immunhx <dbl>, transhx <dbl>, amihx <dbl>, age <dbl>,
# sex <chr>, edu <dbl>, surv2md1 <dbl>, das2d3pc <dbl>, t3d30 <dbl>,
# dth30 <chr>, aps1 <dbl>, scoma1 <dbl>, meanbp1 <dbl>, wblc1 <dbl>,
# hrt1 <dbl>, resp1 <dbl>, temp1 <dbl>, pafi1 <dbl>, alb1 <dbl>, hema1 <dbl>,
# bili1 <dbl>, crea1 <dbl>, sod1 <dbl>, pot1 <dbl>, paco21 <dbl>, …
#Model of outcome of death based on aps1 for all diseases.
aps1_data_nested <- aps1_data_nested |>
mutate(model_object = map(.x = data,
.f = ~glm(formula = death ~ aps1, data = .x,
family='binomial')))
aps1_data_nested# A tibble: 11 × 3
Diagnosis data model_object
<chr> <list> <list>
1 multiple diagnosis <tibble [1,630 × 52]> <glm>
2 seps <tibble [411 × 52]> <glm>
3 card <tibble [1,204 × 52]> <glm>
4 resp <tibble [1,150 × 52]> <glm>
5 renal <tibble [50 × 52]> <glm>
6 hema <tibble [55 × 52]> <glm>
7 gastr <tibble [565 × 52]> <glm>
8 trauma <tibble [42 × 52]> <glm>
9 neuro <tibble [445 × 52]> <glm>
10 meta <tibble [57 × 52]> <glm>
11 ortho <tibble [3 × 52]> <glm>
#View the one for card diagnosis
aps1_data_nested |>
filter(Diagnosis == "card") |>
pull(model_object) |>
pluck(1) |>
tidy(conf.int = TRUE,
conf.level = 0.95)# A tibble: 2 × 7
term estimate std.error statistic p.value conf.low conf.high
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -0.754 0.178 -4.24 2.21e- 5 -1.11 -0.408
2 aps1 0.0257 0.00347 7.41 1.30e-13 0.0190 0.0326
We exclude the ortho diagnosis as we earlier in other analysis saw, it only had 3 data points, which is probaly also the reason why this model fails on this value.
#Summerizes the model
aps1_data_nested <- aps1_data_nested |>
group_by(Diagnosis) |>
filter(Diagnosis != "ortho") |>
mutate(model_object_tidy = map(.x = model_object,
.f = ~tidy(conf.int = TRUE,
conf.level = 0.95,
x = .x)))
aps1_data_nested# A tibble: 10 × 4
# Groups: Diagnosis [10]
Diagnosis data model_object model_object_tidy
<chr> <list> <list> <list>
1 multiple diagnosis <tibble [1,630 × 52]> <glm> <tibble [2 × 7]>
2 seps <tibble [411 × 52]> <glm> <tibble [2 × 7]>
3 card <tibble [1,204 × 52]> <glm> <tibble [2 × 7]>
4 resp <tibble [1,150 × 52]> <glm> <tibble [2 × 7]>
5 renal <tibble [50 × 52]> <glm> <tibble [2 × 7]>
6 hema <tibble [55 × 52]> <glm> <tibble [2 × 7]>
7 gastr <tibble [565 × 52]> <glm> <tibble [2 × 7]>
8 trauma <tibble [42 × 52]> <glm> <tibble [2 × 7]>
9 neuro <tibble [445 × 52]> <glm> <tibble [2 × 7]>
10 meta <tibble [57 × 52]> <glm> <tibble [2 × 7]>
#Unnest the summerized values
aps1_estimates <- aps1_data_nested |>
unnest(cols = model_object_tidy)
aps1_estimates# A tibble: 20 × 10
# Groups: Diagnosis [10]
Diagnosis data model_object term estimate std.error statistic p.value
<chr> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
1 multiple d… <tibble> <glm> (Int… -0.163 0.178 -0.916 3.59e- 1
2 multiple d… <tibble> <glm> aps1 0.0180 0.00295 6.10 1.07e- 9
3 seps <tibble> <glm> (Int… -1.30 0.425 -3.06 2.22e- 3
4 seps <tibble> <glm> aps1 0.0292 0.00637 4.59 4.51e- 6
5 card <tibble> <glm> (Int… -0.754 0.178 -4.24 2.21e- 5
6 card <tibble> <glm> aps1 0.0257 0.00347 7.41 1.30e-13
7 resp <tibble> <glm> (Int… -0.871 0.208 -4.18 2.90e- 5
8 resp <tibble> <glm> aps1 0.0240 0.00391 6.15 7.92e-10
9 renal <tibble> <glm> (Int… 0.429 1.05 0.407 6.84e- 1
10 renal <tibble> <glm> aps1 0.00959 0.0158 0.606 5.45e- 1
11 hema <tibble> <glm> (Int… 0.185 1.07 0.173 8.63e- 1
12 hema <tibble> <glm> aps1 0.0191 0.0168 1.14 2.55e- 1
13 gastr <tibble> <glm> (Int… -0.771 0.297 -2.60 9.37e- 3
14 gastr <tibble> <glm> aps1 0.0272 0.00528 5.15 2.61e- 7
15 trauma <tibble> <glm> (Int… -1.56 1.07 -1.45 1.47e- 1
16 trauma <tibble> <glm> aps1 0.0170 0.0198 0.857 3.91e- 1
17 neuro <tibble> <glm> (Int… -0.223 0.243 -0.919 3.58e- 1
18 neuro <tibble> <glm> aps1 0.0183 0.00631 2.90 3.70e- 3
19 meta <tibble> <glm> (Int… -0.552 0.760 -0.726 4.68e- 1
20 meta <tibble> <glm> aps1 0.0156 0.0129 1.21 2.26e- 1
# ℹ 2 more variables: conf.low <dbl>, conf.high <dbl>
#Creates new dataframe with relevant columns for further analysis.
aps1_estimates <- aps1_estimates |>
select(c(term,
Diagnosis,
p.value,
estimate,
conf.low,
conf.high))
aps1_estimates# A tibble: 20 × 6
# Groups: Diagnosis [10]
term Diagnosis p.value estimate conf.low conf.high
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) multiple diagnosis 3.59e- 1 -0.163 -0.513 0.185
2 aps1 multiple diagnosis 1.07e- 9 0.0180 0.0123 0.0238
3 (Intercept) seps 2.22e- 3 -1.30 -2.15 -0.480
4 aps1 seps 4.51e- 6 0.0292 0.0170 0.0421
5 (Intercept) card 2.21e- 5 -0.754 -1.11 -0.408
6 aps1 card 1.30e-13 0.0257 0.0190 0.0326
7 (Intercept) resp 2.90e- 5 -0.871 -1.28 -0.466
8 aps1 resp 7.92e-10 0.0240 0.0165 0.0318
9 (Intercept) renal 6.84e- 1 0.429 -1.65 2.56
10 aps1 renal 5.45e- 1 0.00959 -0.0210 0.0422
11 (Intercept) hema 8.63e- 1 0.185 -1.94 2.35
12 aps1 hema 2.55e- 1 0.0191 -0.0128 0.0544
13 (Intercept) gastr 9.37e- 3 -0.771 -1.36 -0.195
14 aps1 gastr 2.61e- 7 0.0272 0.0171 0.0378
15 (Intercept) trauma 1.47e- 1 -1.56 -3.79 0.489
16 aps1 trauma 3.91e- 1 0.0170 -0.0219 0.0574
17 (Intercept) neuro 3.58e- 1 -0.223 -0.704 0.250
18 aps1 neuro 3.70e- 3 0.0183 0.00616 0.0309
19 (Intercept) meta 4.68e- 1 -0.552 -2.09 0.925
20 aps1 meta 2.26e- 1 0.0156 -0.00895 0.0423
#Add significance determination value
aps1_estimates <- aps1_estimates |>
mutate(q.value = map_dbl(.x=p.value,
.f = ~p.adjust(.x,
method = 'fdr'))) |>
mutate(is_significant = case_when(q.value > 0.05 ~ 'No',
q.value <= 0.05 ~ 'Yes'))
aps1_estimates# A tibble: 20 × 8
# Groups: Diagnosis [10]
term Diagnosis p.value estimate conf.low conf.high q.value is_significant
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 (Inte… multiple… 3.59e- 1 -0.163 -0.513 0.185 3.59e- 1 No
2 aps1 multiple… 1.07e- 9 0.0180 0.0123 0.0238 1.07e- 9 Yes
3 (Inte… seps 2.22e- 3 -1.30 -2.15 -0.480 2.22e- 3 Yes
4 aps1 seps 4.51e- 6 0.0292 0.0170 0.0421 4.51e- 6 Yes
5 (Inte… card 2.21e- 5 -0.754 -1.11 -0.408 2.21e- 5 Yes
6 aps1 card 1.30e-13 0.0257 0.0190 0.0326 1.30e-13 Yes
7 (Inte… resp 2.90e- 5 -0.871 -1.28 -0.466 2.90e- 5 Yes
8 aps1 resp 7.92e-10 0.0240 0.0165 0.0318 7.92e-10 Yes
9 (Inte… renal 6.84e- 1 0.429 -1.65 2.56 6.84e- 1 No
10 aps1 renal 5.45e- 1 0.00959 -0.0210 0.0422 5.45e- 1 No
11 (Inte… hema 8.63e- 1 0.185 -1.94 2.35 8.63e- 1 No
12 aps1 hema 2.55e- 1 0.0191 -0.0128 0.0544 2.55e- 1 No
13 (Inte… gastr 9.37e- 3 -0.771 -1.36 -0.195 9.37e- 3 Yes
14 aps1 gastr 2.61e- 7 0.0272 0.0171 0.0378 2.61e- 7 Yes
15 (Inte… trauma 1.47e- 1 -1.56 -3.79 0.489 1.47e- 1 No
16 aps1 trauma 3.91e- 1 0.0170 -0.0219 0.0574 3.91e- 1 No
17 (Inte… neuro 3.58e- 1 -0.223 -0.704 0.250 3.58e- 1 No
18 aps1 neuro 3.70e- 3 0.0183 0.00616 0.0309 3.70e- 3 Yes
19 (Inte… meta 4.68e- 1 -0.552 -2.09 0.925 4.68e- 1 No
20 aps1 meta 2.26e- 1 0.0156 -0.00895 0.0423 2.26e- 1 No
Plot of the model estimate aps1:
#Plots of the estimated variables
model_estimates <- aps1_estimates |>
ggplot(mapping = aes(x = estimate,
y = fct_reorder(Diagnosis,
estimate))) +
facet_wrap(~term, scales = "free_x")+
geom_point(aes(color=is_significant)) +
geom_errorbarh(aes(xmin = conf.low,
xmax = conf.high, color=is_significant)) +
geom_vline(xintercept = 0) +
scale_color_manual(
values = c(No = "darkred", Yes = "darkgoldenrod1"),
) +
labs(x='Estimates(95% CIs)',
y='Diseases',
title = 'The APACHE scores contribution to the prediction of death',
color = "Significant")
ggsave(filename = "08_model_estimates_7.png",
plot = model_estimates,
device = "png",
path = "../results",
scale = 1,
limitsize = TRUE)Saving 7 x 5 in image
model_estimatesThe higher the estimate of the slope , the higher the risk of dying is by each rise in the APACHE score.
Plot of the model estimates intercept:
Table with relevant model values for presentation.
#Table of relevant estimates
intercepts<-aps1_estimates |>
filter(term == "(Intercept)") |>
select(Diagnosis, estimate, p.value)
coefficients<-aps1_estimates |>
filter(term == "aps1") |>
select(Diagnosis, estimate)
model_variables<-
inner_join(coefficients, intercepts, by = "Diagnosis") |>
rename(Intercept=estimate.y) |>
rename(Coefficient=estimate.x) |>
ungroup()
model_variables# A tibble: 10 × 4
Diagnosis Coefficient Intercept p.value
<chr> <dbl> <dbl> <dbl>
1 multiple diagnosis 0.0180 -0.163 0.359
2 seps 0.0292 -1.30 0.00222
3 card 0.0257 -0.754 0.0000221
4 resp 0.0240 -0.871 0.0000290
5 renal 0.00959 0.429 0.684
6 hema 0.0191 0.185 0.863
7 gastr 0.0272 -0.771 0.00937
8 trauma 0.0170 -1.56 0.147
9 neuro 0.0183 -0.223 0.358
10 meta 0.0156 -0.552 0.468
quarto::quarto_render(input = "08_logistic_modelling.qmd",
output_format = "html")[31m
processing file: 08_logistic_modelling.qmd
[39m1/31
2/31 [unnamed-chunk-1]
3/31
4/31 [unnamed-chunk-2]
5/31
6/31 [unnamed-chunk-3]
7/31
8/31 [unnamed-chunk-4]
9/31
10/31 [unnamed-chunk-5]
11/31
12/31 [unnamed-chunk-6]
13/31
14/31 [unnamed-chunk-7]
15/31
16/31 [unnamed-chunk-8]
17/31
18/31 [unnamed-chunk-9]
19/31
20/31 [unnamed-chunk-10]
21/31
22/31 [unnamed-chunk-11]
23/31
24/31 [unnamed-chunk-12]
25/31
26/31 [unnamed-chunk-13]
27/31
28/31 [unnamed-chunk-14]
29/31
30/31 [unnamed-chunk-15]
31/31
[31moutput file: 08_logistic_modelling.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 08_logistic_modelling.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
title: Logistic modelling
editor: visual
Output created: 08_logistic_modelling.html
file.rename(from = "08_logistic_modelling.html",
to = "../results/08_logistic_modelling.html")[1] TRUE
Functions
#| label: function-for-words-to-binary
#' Add together two numbers. To be used in a mutate function, when wanting to convert several columns to binary.
#' The mutate function contains the data that is to be converted, thus the only input is the column name.
#'
#' @param x The column name
#' @returns A binarized column
#' @examples
#' rhc_clean_long <- rhc_clean |>
#' mutate(across(resp:ortho,
#' ~create_binary_column(.)))
#'
#'
create_binary_column <- function(x) {
ifelse(x == "Yes", 1,
ifelse(x == "No", 0, x))
}quarto::quarto_render(input = "99_functions.qmd",
output_format = "html")[31m
processing file: 99_functions.qmd
[39m1/2 [unnamed-chunk-1]
2/2
[31moutput file: 99_functions.knit.md
[39m[31mWarning message:
In png(..., res = dpi, units = "in") :
unable to open connection to X11 display ''
[39m[1mpandoc [22m
to: html
output-file: 99_functions.html
standalone: true
section-divs: true
html-math-method: mathjax
wrap: none
default-image-extension: png
[1mmetadata[22m
document-css: false
link-citations: true
date-format: long
lang: en
Output created: 99_functions.html
file.rename(from = "99_functions.html",
to = "../results/99_functions.html")[1] TRUE